# -*- coding: utf-8 -*-
# filename: handle_wa.py

import web
from urllib2 import urlopen # python 2.x
# from urllib.request import urlopen #python 3.x
from urllib import urlretrieve
from lxml import etree
import re
import os
import threading

class Handle(object):
    """web.py request handler that mirrors a web page locally.

    GET with a ``?website=`` parameter fetches the page over HTTP, downloads
    every file referenced by an ``href``/``src`` attribute into a local
    directory named after the host, rewrites those references in the HTML to
    bare filenames, saves the result as ``index.html`` and redirects to it.
    GET without parameters (and any POST) just serves the input form.

    Python 2 only: uses ``print`` statements, ``urllib2``/``urllib`` and
    ``ur''`` literals.
    """

    # NOTE(review): `global page` inside a class body makes the assignment
    # below bind a MODULE-level name, not a class attribute — `page` is then
    # visible to GET/POST as a plain global. Unusual, but intentional here.
    global page
    # Minimal HTML form served when no website was submitted.
    page = '''
<html>
<head><title>WebAgent</title></head>
<body>
<form action="/" method="get">
Website: <input type="text" name="website" />
<button type="submit" value="Submit">Submit</button>
<button type="reset" value="Reset">Reset</button>
</form>
</body>
</html>
'''
    def GET(self):
        """Serve the form, or mirror the requested site and redirect to the copy.

        Returns the form HTML when no ``website`` parameter is present or on
        any error; otherwise issues a ``web.seeother`` redirect to the saved
        ``index.html``.
        """
        try:
            webData = web.input()
            # print webData, type(webData)
            if webData:
                rUrl = webData['website']
                # Loose sanity check that the input looks like a host/path;
                # only then is a leading "http://" stripped (the scheme is
                # re-added on fetch below). NOTE(review): https URLs are not
                # handled — "https://foo" fails the inner check and is later
                # fetched as "http://https://foo".
                if re.search(r'([\w-]+\.)+[\w-]+(/[\w./?%&=]*)?$', rUrl) is not None:
                    if re.search(r'^http://.*', rUrl) is not None:
                        rUrl = rUrl[len("http://"):]
                print "<Url>", rUrl

                # Fetch the raw page bytes. NOTE(review): no timeout, and
                # uc.close() is skipped if read() raises — the outer except
                # swallows it either way.
                uc = urlopen("http://" + rUrl)
                ps = uc.read()
                uc.close()
                html = etree.HTML(ps)
                nodes = html.xpath("//*[@href | @src]")  # most of @href are .html and .css, @src are .js, .jpg and .png

                sfThreadList = []  # download threads, started after the rewrite loop
                # Save directory is derived from the host part only (no
                # subdirectories), e.g. "www.foo.com" -> "./_www_foo_com/".
                sp = "./_{}/".format(rUrl.split('/')[0].replace('.', '_'))
                # sp = "./_{}/".format(rUrl.replace('.', '_'))
                if not os.path.exists(sp):
                    os.makedirs(sp)  # create the storage directory

                for node in nodes:
                    keys = node.keys()
                    values = node.values()
                    # Walk attributes positionally; keys/values are parallel
                    # lists from lxml, so index i pairs an attribute name
                    # with its value.
                    for index in range(len(keys)):
                        if keys[index] == "href" or keys[index] == "src":
                            # Match file-like URLs: plain paths, paths with a
                            # trailing "?query", "=,&" characters, or CJK
                            # characters in the filename.
                            value = re.search(ur'[_\w\.-]+(/[_\w-]+)*/([\u4e00-\u9fa5]|[-_=,&\w])+\.\w+(\?[&%=\w-]+)?$', values[index])
                            if value:
                                # Narrower match: a RELATIVE path (no host,
                                # no "?" suffix). If it hits, the reference
                                # needs the host prepended.
                                tValue = re.search(ur'^[_\w-]+(/[_\w-]+)*/([\u4e00-\u9fa5]|[-_=,&\w])+\.\w+', values[index])
                                if tValue is not None:
                                    value = rUrl + '/' + tValue.group()  # prepend host to complete the URL
                                else:
                                    value = value.group().split('?')[0]  # absolute URL: drop the "?query" part
                                    # print values[index], value
                                try:
                                    value = value.encode('utf-8')
                                    values[index] = values[index].encode('utf-8')
                                    fn = value.split('/')[-1]
                                    fp = sp + fn
                                    # print fn, fp
                                    # urlretrieve("http://" + value, fp)  # save the file
                                    # Queue the download; one thread per file.
                                    sfThreadList.append(
                                        threading.Thread(target=urlretrieve, args=("http://" + value, fp)))
                                    # ps = ps.replace(values[index], fp)  # rewrite the path inside the HTML
                                    # Rewrite the reference in the raw HTML to
                                    # the bare filename. NOTE(review): a plain
                                    # byte-level replace — it also rewrites any
                                    # unrelated occurrence of the same string.
                                    ps = ps.replace(values[index], fn)
                                except Exception, err:
                                    # Best-effort: log and keep processing the
                                    # remaining attributes.
                                    print ">>>Save>>>", value, ">>>", err
                # Write the rewritten page, prefixed with the input form so the
                # mirrored copy still offers the search box.
                with open("{}index.html".format(sp), "wb") as fs:
                    fs.write(page + ps)
                    # print "index.html"
                try:
                    # Start all downloads, then wait for them so the redirect
                    # below lands on a fully-populated directory.
                    for t in sfThreadList:
                        t.start()
                    for t in sfThreadList:
                        t.join()
                except Exception, err:
                    print ">>>Thread>>>", err

                # return "<script>window.location.href='./{}index.html';</script>".format(sp)
                web.seeother("{}index.html".format(sp))
            else:
                return page
        except Exception,err:
            # Broad catch-all: any failure falls back to serving the form.
            print ">>>Error>>>", err
            return page
    def POST(self):
        """POST is not used for submission; always serve the input form."""
        return page


